Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(ggplot2)
library(ggmap)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggmap':
##
## wind
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
Load the dataset
# Location of the raw CSV published in the CEMA internship task repository.
url <- "https://raw.githubusercontent.com/cema-uonbi/internship_task/main/data/cema_internship_task_2023.csv"
# Download and parse the file into a base data.frame, then preview the
# first six rows.
data <- read.csv(file = url)
head(x = data)
## period county Total.Dewormed Acute.Malnutrition
## 1 Jan-23 Baringo County 3659 8
## 2 Jan-23 Bomet County 1580 NA
## 3 Jan-23 Bungoma County 6590 24
## 4 Jan-23 Busia County 7564 NA
## 5 Jan-23 Elgeyo Marakwet County 1407 NA
## 6 Jan-23 Embu County 3241 72
## stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
## 1 471 34 380 2620
## 2 1 3 NA 1984
## 3 98 154 23 4576
## 4 396 143 111 2239
## 5 92 71 5 2739
## 6 326 86 24 1376
## Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
## 1 85 739 731
## 2 41 86 16
## 3 231 315 120
## 4 251 608 125
## 5 57 104 21
## 6 141 544 160
Explore the structure and summary statistics of the dataset
# Column-by-column overview: each column's type and its first few values.
data %>% glimpse()
## Rows: 1,410
## Columns: 11
## $ period <chr> "Jan-23", "Jan-23", "Jan-23", "Jan-23", "Jan-…
## $ county <chr> "Baringo County", "Bomet County", "Bungoma Co…
## $ Total.Dewormed <int> 3659, 1580, 6590, 7564, 1407, 3241, 6751, 469…
## $ Acute.Malnutrition <int> 8, NA, 24, NA, NA, 72, 250, 9, 26, 104, 36, N…
## $ stunted.6.23.months <int> 471, 1, 98, 396, 92, 326, 40, 209, 51, 319, 2…
## $ stunted.0..6.months <int> 34, 3, 154, 143, 71, 86, 13, 87, 6, 102, 279,…
## $ stunted.24.59.months <int> 380, NA, 23, 111, 5, 24, 99, 58, 50, 155, 292…
## $ diarrhoea.cases <int> 2620, 1984, 4576, 2239, 2739, 1376, 2314, 278…
## $ Underweight.0..6.months <int> 85, 41, 231, 251, 57, 141, 223, 140, 13, 139,…
## $ Underweight.6.23.months <dbl> 739, 86, 315, 608, 104, 544, 1856, 298, 180, …
## $ Underweight.24.59.Months <dbl> 731, 16, 120, 125, 21, 160, 1833, 84, 271, 57…
# Per-column summary statistics, including the NA count for each column.
data %>% summary()
## period county Total.Dewormed Acute.Malnutrition
## Length:1410 Length:1410 Min. : 97 Min. : 1.0
## Class :character Class :character 1st Qu.: 2454 1st Qu.: 15.0
## Mode :character Mode :character Median : 4564 Median : 39.0
## Mean : 11458 Mean : 125.4
## 3rd Qu.: 8222 3rd Qu.: 143.5
## Max. :392800 Max. :4123.0
## NA's :355
## stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
## Min. : 1.0 Min. : 1.0 Min. : 1.0 Min. : 198
## 1st Qu.: 69.5 1st Qu.: 36.5 1st Qu.: 22.0 1st Qu.: 1464
## Median : 159.0 Median : 84.0 Median : 50.0 Median : 2158
## Mean : 280.2 Mean : 139.8 Mean : 110.8 Mean : 2813
## 3rd Qu.: 328.5 3rd Qu.: 157.0 3rd Qu.: 114.2 3rd Qu.: 3335
## Max. :4398.0 Max. :7900.0 Max. :3169.0 Max. :15795
## NA's :11 NA's :19 NA's :14
## Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
## Min. : 6.0 Min. : 16.0 Min. : 1.00
## 1st Qu.: 87.0 1st Qu.: 249.0 1st Qu.: 51.25
## Median : 162.5 Median : 456.0 Median : 120.50
## Mean : 223.5 Mean : 652.3 Mean : 305.74
## 3rd Qu.: 272.8 3rd Qu.: 791.8 3rd Qu.: 311.00
## Max. :1937.0 Max. :5348.0 Max. :4680.00
##
Handling missing values
# Fill in missing values: every numeric column has its NAs replaced by that
# column's mean, computed over the non-missing entries. `across()` applies
# the same rule to each numeric column instead of spelling it out once per
# column, so the rule cannot drift between columns.
#
# `ifelse()` is kept deliberately: a column with no NAs keeps its original
# integer type, while a column that receives an imputed mean becomes double.
#
# NOTE(review): per the summary() output, Acute.Malnutrition has 355 NAs out
# of 1,410 rows. Mean-imputing that many values shrinks the column's variance
# and can bias the correlation/regression computed later -- confirm this is
# acceptable for the analysis.
data <- data %>%
  mutate(
    across(
      where(is.numeric),
      function(col) ifelse(is.na(col), mean(col, na.rm = TRUE), col)
    )
  )
# Inspect the filled data
glimpse(data)
## Rows: 1,410
## Columns: 11
## $ period <chr> "Jan-23", "Jan-23", "Jan-23", "Jan-23", "Jan-…
## $ county <chr> "Baringo County", "Bomet County", "Bungoma Co…
## $ Total.Dewormed <int> 3659, 1580, 6590, 7564, 1407, 3241, 6751, 469…
## $ Acute.Malnutrition <dbl> 8.0, 125.4, 24.0, 125.4, 125.4, 72.0, 250.0, …
## $ stunted.6.23.months <dbl> 471, 1, 98, 396, 92, 326, 40, 209, 51, 319, 2…
## $ stunted.0..6.months <dbl> 34, 3, 154, 143, 71, 86, 13, 87, 6, 102, 279,…
## $ stunted.24.59.months <dbl> 380.000, 110.765, 23.000, 111.000, 5.000, 24.…
## $ diarrhoea.cases <int> 2620, 1984, 4576, 2239, 2739, 1376, 2314, 278…
## $ Underweight.0..6.months <int> 85, 41, 231, 251, 57, 141, 223, 140, 13, 139,…
## $ Underweight.6.23.months <dbl> 739, 86, 315, 608, 104, 544, 1856, 298, 180, …
## $ Underweight.24.59.Months <dbl> 731, 16, 120, 125, 21, 160, 1833, 84, 271, 57…
Visualize with histograms
Create interactive histograms using plotly
# The plots below need the indicators in long format: one row per
# (period, county, indicator) with the indicator name in `variable` and its
# value in `value`. `data_tidy` is not created anywhere in the visible
# script, so build it here unless an earlier step has already defined it.
if (!exists("data_tidy")) {
  data_tidy <- data %>%
    pivot_longer(
      cols = where(is.numeric),
      names_to = "variable",
      values_to = "value"
    )
}

# One histogram per indicator. Free scales let every facet use its own axis
# ranges, since the indicators differ by orders of magnitude.
p <- ggplot(data_tidy, aes(x = value)) +
  geom_histogram() +
  facet_wrap(~variable, scales = "free") +
  theme_minimal()
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Visualize with boxplots
Create interactive boxplots using plotly
# Distribution of each indicator per reporting period. The boxes are
# coloured by indicator, and the axes are flipped so the period labels run
# along the vertical axis.
p <- data_tidy %>%
  ggplot(mapping = aes(x = period, y = value, color = variable)) +
  geom_boxplot() +
  theme_minimal() +
  coord_flip()
# Render the ggplot as an interactive plotly widget.
ggplotly(p)
Join with spatial data
# Read the county boundary shapefile into an sf object. The relative path is
# resolved against the current working directory.
kenya_sf <- st_read(dsn = "County.shp")
## Reading layer `County' from data source
## `C:\Users\STREET_CODER\Documents\CEMA-Intership\County.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 47 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 33.91028 ymin: -4.798828 xmax: 41.90613 ymax: 5.414124
## Geodetic CRS: WGS 84
# Drop the trailing " County" suffix so the names can be matched against the
# shapefile's `Name` field. The pattern is anchored at the end of the string,
# so at most one match per value is possible.
data[["county"]] <- sub(" County$", "", data[["county"]])
# Attach the indicator rows to the county geometries. The shapefile is on the
# left, so every county feature is kept and the result remains an sf object.
data_sf <- kenya_sf %>%
  left_join(data, by = c("Name" = "county"))
Map acute malnutrition
Create an interactive choropleth map using plotly
State research question
“What factors are associated with high rates of acute malnutrition
across counties?”
Statistical analysis (e.g., correlation and regression)
Calculate the correlation between Acute Malnutrition and Total Dewormed
# Pearson correlation (the default method of cor()) between the two
# indicator columns of the joined data.
correlation <- cor(
  data_sf[["Acute.Malnutrition"]],
  data_sf[["Total.Dewormed"]]
)
print("Correlation coefficient between Acute Malnutrition and Dewormed:")
## [1] "Correlation coefficient between Acute Malnutrition and Dewormed:"
print(correlation)
## [1] 0.07208518
Fit a linear regression model to explore the relationship between
Acute Malnutrition and Dewormed
# Ordinary least squares fit with Acute.Malnutrition as the response and
# Total.Dewormed as the single predictor.
linear_model <- lm(
  formula = Acute.Malnutrition ~ Total.Dewormed,
  data = data_sf
)
# Print coefficients, residual summary and goodness-of-fit statistics.
linear_model %>% summary()
##
## Call:
## lm(formula = Acute.Malnutrition ~ Total.Dewormed, data = data_sf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -213.3 -102.3 -36.7 6.6 3983.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.179e+02 6.720e+00 17.544 < 2e-16 ***
## Total.Dewormed 6.548e-04 2.415e-04 2.712 0.00677 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 230 on 1408 degrees of freedom
## Multiple R-squared: 0.005196, Adjusted R-squared: 0.00449
## F-statistic: 7.355 on 1 and 1408 DF, p-value: 0.006771
Graphs showing the relationship between Acute Malnutrition and
Dewormed
Create interactive scatter plot with a regression line using
plotly
# Scatter plot of the two indicators with a fitted straight line overlaid.
# `se = FALSE` suppresses the confidence band around the line.
p <- data_sf %>%
  ggplot(mapping = aes(x = Total.Dewormed, y = Acute.Malnutrition)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Dewormed") +
  ylab("Acute Malnutrition") +
  theme_minimal()
# Render the ggplot as an interactive plotly widget.
ggplotly(p)
## `geom_smooth()` using formula = 'y ~ x'
Create interactive box plot using plotly
# A box plot needs a discrete grouping on the x axis. Total.Dewormed is
# continuous, so mapping it directly puts every observation into a single
# box and triggers ggplot2's "Continuous x aesthetic" warning. Bin it into
# four equal-count groups with cut_number() and compare the distribution of
# Acute.Malnutrition across those groups instead.
p <- ggplot(
  data_sf,
  aes(x = cut_number(Total.Dewormed, 4), y = Acute.Malnutrition)
) +
  geom_boxplot() +
  labs(x = "Dewormed (quartile group)", y = "Acute Malnutrition") +
  theme_minimal()
# Render the ggplot as an interactive plotly widget.
ggplotly(p)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?